#!/usr/bin/env perl

# Copyright (C) Yichun Zhang (agentzh)

# Thanks Brendan Gregg for the inspiration given here:
#   http://dtrace.org/blogs/brendan/2011/07/08/off-cpu-performance-analysis/

use 5.006001;
use strict;
use warnings;

use Getopt::Long qw( GetOptions );

GetOptions("a=s",       \(my $stap_args),
           "d",         \(my $dump_src),
           "distr",     \(my $check_distr),
           "h",         \(my $help),
           "k",         \(my $check_kernel),
           "l=i",       \(my $limit),
           "min=i",     \(my $min_elapsed),
           "p=i",       \(my $pid),
           "t=i",       \(my $time),
           "u",         \(my $check_user))
    or die usage();

if ($help) {
    print usage();
    exit;
}

if (!defined $min_elapsed) {
    $min_elapsed = 4;
}

if (!defined $pid) {
    die "No process pid specified by the -p option.\n";
}

my $condition = "pid() == target()";

if (!defined $time) {
    die "No -t <seconds> option specified.\n";
}

if (!defined $limit) {
    $limit = 1024;
}

if (!defined $stap_args) {
    $stap_args = '';
}

if ($stap_args !~ /(?:^|\s)-D\s*MAXACTION=/) {
    $stap_args .= " -DMAXACTION=100000";
}

if ($stap_args !~ /(?:^|\s)-D\s*MAXMAPENTRIES=/) {
    $stap_args .= " -DMAXMAPENTRIES=3000";
}

if ($stap_args !~ /(?:^|\s)-D\s*MAXBACKTRACE=/) {
    $stap_args .= " -DMAXBACKTRACE=200";
}

if ($stap_args !~ /(?:^|\s)-D\s*MAXSTRINGLEN=2048/) {
    $stap_args .= " -DMAXSTRINGLEN=2048";
}

$stap_args .= " -DSTP_NO_OVERLOAD";

if ($^O ne 'linux') {
    die "Only linux is supported but I am on $^O.\n";
}

my $exec_file = "/proc/$pid/exe";
if (!-f $exec_file) {
    die "User process $pid is not running or ",
        "you do not have enough permissions.\n";
}

my $exec_path = readlink $exec_file;

my $ver = `stap --version 2>&1`;
if (!defined $ver) {
    die "Systemtap not installed or its \"stap\" utility is not visible to the PATH environment: $!\n";
}

if ($ver =~ /version\s+(\d+\.\d+)/i) {
    my $v = $1;
    if ($v < 2.1) {
        die "ERROR: at least systemtap 2.1 is required but found $v\n";
    }

} else {
    die "ERROR: unknown version of systemtap:\n$ver\n";
}

my $preamble = <<_EOC_;
global quit = 0;
global found

probe begin {
    warn(sprintf("Tracing %d ($exec_path)...\\n", target()))
}
_EOC_

my $postamble = <<_EOC_;
probe timer.s($time) {
    if (!found) {
        warn("No backtraces found. Quitting now...\\n")
        exit()

    } else {
        warn("Time's up. Quitting now...(it may take a while)\\n")
        quit = 1
    }
}
_EOC_

my $stap_src;
my $d_so_args = '';

if ($check_distr) {
    $stap_src = <<_EOC_;
global start_time
global elapsed_times

$preamble

probe end {
    if (!found) {
        println("\\nNo samples found yet.")

    } else {
        println("=== Off-CPU time distribution (in us) ===")
        printf("min/avg/max: %d/%d/%d\\n",
               \@min(elapsed_times), \@avg(elapsed_times), \@max(elapsed_times))
        println(\@hist_log(elapsed_times))
    }
}

probe scheduler.cpu_off {
    if ($condition) {
        if (!quit) {
            start_time[tid()] = gettimeofday_us()

        } else {
            exit()
        }
    }
}

probe scheduler.cpu_on {
    if ($condition && !quit) {
        t = tid()
        begin = start_time[t]
        if (begin > 0) {
            elapsed = gettimeofday_us() - begin
            if (elapsed >= $min_elapsed) {
                found = 1
                elapsed_times <<< elapsed
            }
            delete start_time[t]
        }
    }
}

probe timer.s($time) {
    println("Exiting...Please wait...")
    quit = 1
}
_EOC_

} else {
    # for sampling backtraces

    my ($gen_bt_code, $print_bt_code);

    if ($check_kernel) {
        if ($check_user) {
            $gen_bt_code = "backtrace(), ubacktrace()";
            $print_bt_code = <<_EOC_;
            foreach ([kbt, ubt] in bts- limit $limit) {
                print_stack(kbt)
                print_ustack(ubt)
                printf("\\t%d\\n", \@sum(bts[kbt, ubt]))
            }
_EOC_

        } else {
            $gen_bt_code = "backtrace()";
            $print_bt_code = <<_EOC_;
            foreach (bt in bts- limit $limit) {
                print_stack(bt)
                printf("\\t%d\\n", \@sum(bts[bt]))
            }
_EOC_
        }

    } else {
        $gen_bt_code = "ubacktrace()";
        $print_bt_code = <<_EOC_;
            foreach (bt in bts- limit $limit) {
                print_ustack(bt)
                printf("\\t%d\\n", \@sum(bts[bt]))
            }
_EOC_
    }

    $stap_src = <<_EOC_;
global bts
global start_time

$preamble

probe scheduler.cpu_off {
    if ($condition) {
        if (!quit) {
            start_time[tid()] = gettimeofday_us()

        } else {
$print_bt_code
            exit()
        }
    }
}

probe scheduler.cpu_on {
    if ($condition && !quit) {
        t = tid()
        begin = start_time[t]
        if (begin > 0) {
            elapsed = gettimeofday_us() - begin
            if (elapsed >= $min_elapsed) {
                bts[$gen_bt_code] <<< elapsed
                found = 1
            }
            delete start_time[t]
        }
    }
}

$postamble
_EOC_

    my %so_files;
    {
        my $maps_file = "/proc/$pid/maps";
        open my $in, $maps_file
            or die "Cannot open $maps_file for reading: $!\n";

        while (<$in>) {
            if (/\s+(\/\S+\.so(?:\.\S+)?)$/) {
                if (!exists $so_files{$1}) {
                    $so_files{$1} = 1;
                    #warn $1, "\n";
                }
            }
        }

        close $in;
    }

    if (%so_files) {
        $d_so_args = join " ", map { "-d $_" } sort keys %so_files;
    }
}

if ($dump_src) {
    print $stap_src;
    exit;
}


#warn "$d_so_args\n";

open my $in, "|stap --skip-badvars --all-modules -x $pid -d '$exec_path' --ldd $d_so_args $stap_args -"
    or die "Cannot run stap: $!\n";

print $in $stap_src;

close $in;

sub usage {
    return <<'_EOC_';
Usage:
    sample-bt-off-cpu [optoins]

Options:
    -a <args>           Pass extra arguments to the stap utility.
    -d                  Dump out the systemtap script source.
    --distr             Analyze the distribution of the elapsed off-CPU times only.
    -h                  Print this usage.
    -k                  Analyze kernelspace backtraces.
    -l <count>          Only output <count> most frenquent backtrace samples.
                        (Default to 1024)
    --min=<us>          Minimal elapsed off-CPU time to be tracked.
                        (Default to 4us)
    -p <pid>            Specify the user process pid.
    -t <seconds>        Specify the number of seconds for sampling.
    -u                  Analyze userspace backtraces.

Examples:
    sample-bt-off-cpu -p 12345 -t 10
    sample-bt-off-cpu -p 12345 -t 5 -a '-DMAXACTION=100000'
    sample-bt-off-cpu --distr -p 12345 -t 10 --min=1
_EOC_
}
